import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline
# Load the college financials table and discard rows whose sector code is 0 or 99.
datadir = 'data/'
college_financials = pd.read_csv(datadir + 'college_financials.csv', header=0)
# The filtered frame is rebound to the same name, so the unfiltered frame is
# not kept alive under a second reference.
college_financials = college_financials[~college_financials['sector'].isin([0, 99])]
print('college_financials has {} records'.format(college_financials.shape[0]))
# Summary statistics, transposed so that each described column becomes a row.
college_financials.describe().T
# For every column that is neither text (object) nor int64, draw a violin plot
# of its values grouped by `sector` and save it as
# exploration/college_financials/<column>.pdf.
for c in college_financials.columns:
    # Keep only the rows that have a value in this column. Filtering before
    # copying duplicates just the surviving rows instead of the whole frame.
    df_temp = college_financials[college_financials[c].notnull()].copy()
    if df_temp[c].dtype in (object, np.int64):
        continue
    n = 0  # number of log transforms applied; reported in the plot title
    # Compress the column when its maximum is more than 3x its mean (in
    # absolute ratio), so a few extreme values do not flatten the violins.
    if np.abs(df_temp[c].max() / df_temp[c].mean()) > 3:
        values = df_temp[c]
        magnitude = np.abs(values)
        # Signed log10. Zeros stay zero: their magnitude is swapped for 1
        # (log10(1) == 0) and np.sign(0) == 0.
        # NOTE(review): for 0 < |x| < 1 log10 is negative, so the result has
        # the opposite sign of x. Confirm that no plotted column holds such
        # fractional values, or switch to a log10(1 + |x|) style transform.
        df_temp[c] = np.sign(values) * np.log10(magnitude.where(magnitude != 0, 1))
        n += 1
    plt.subplots(figsize=(16, 8))
    # `dropna` is not a documented violinplot option and is not needed here:
    # rows with a null in this column were already removed above.
    sns.violinplot(x="sector", y=c, data=df_temp)
    # The "datacount" value is the fraction of rows that are non-null in this column.
    plt.title(c + ('(log {} times)'.format(n) if n > 0 else '') + 'datacount: ' + str(len(df_temp)/len(college_financials)))
    plt.savefig('exploration/college_financials/' + c + '.pdf')
After an initial examination of the data, the overall quality appears to be poorer in sectors 0 and 99: they contain duplicates (for example, the 2010 data for Aveda Institute Chapel Hill), and together they account for only a very small percentage of all records. We therefore remove all records in sectors 0 and 99.
# Bar charts of the number of records per sector and per academic year,
# ordered by the category value rather than by frequency.
# Each chart gets its own figure and axes: without an explicit `ax`, a Series
# plot is drawn on the current axes when a figure is already open, so running
# the two charts back to back would overlay them.
# `Series.value_counts` replaces the deprecated top-level `pd.value_counts`.
sector_fig, sector_ax = plt.subplots()
college_financials['sector'].value_counts().sort_index().plot.bar(ax=sector_ax)
sector_ax.set_title('count by sector')

year_fig, year_ax = plt.subplots()
college_financials['academic_year'].value_counts().sort_index().plot.bar(ax=year_ax)
year_ax.set_title('count by academic_year')
# Lookup table from sector code to the value stored in the new `years` column.
# NOTE(review): presumably the nominal program length in years for each sector
# (with 0 for the 0/99 codes) - confirm against the data dictionary.
year_dict = {
    **dict.fromkeys((1, 2, 3), 4),
    **dict.fromkeys((4, 5, 6), 2),
    **dict.fromkeys((7, 8, 9), 1),
    **dict.fromkeys((0, 99), 0),
}
# Direct dictionary lookup per row: a sector code missing from the table
# raises KeyError instead of silently producing a missing value.
college_financials['years'] = college_financials['sector'].apply(year_dict.__getitem__)
# For every column that is neither text (object) nor int64, draw a violin plot
# of its values grouped by `years` and save it as
# exploration/college_financials/<column>_by_years.pdf.
for c in college_financials.columns:
    # Keep only the rows that have a value in this column. Filtering before
    # copying duplicates just the surviving rows instead of the whole frame.
    df_temp = college_financials[college_financials[c].notnull()].copy()
    if df_temp[c].dtype in (object, np.int64):
        continue
    n = 0  # number of log transforms applied; reported in the plot title
    # Compress the column when its maximum is more than 3x its mean (in
    # absolute ratio), so a few extreme values do not flatten the violins.
    if np.abs(df_temp[c].max() / df_temp[c].mean()) > 3:
        values = df_temp[c]
        magnitude = np.abs(values)
        # Signed log10. Zeros stay zero: their magnitude is swapped for 1
        # (log10(1) == 0) and np.sign(0) == 0.
        # NOTE(review): for 0 < |x| < 1 log10 is negative, so the result has
        # the opposite sign of x. Confirm that no plotted column holds such
        # fractional values, or switch to a log10(1 + |x|) style transform.
        df_temp[c] = np.sign(values) * np.log10(magnitude.where(magnitude != 0, 1))
        n += 1
    plt.subplots(figsize=(16, 8))
    # `dropna` is not a documented violinplot option and is not needed here:
    # rows with a null in this column were already removed above.
    sns.violinplot(x="years", y=c, data=df_temp)
    # The "datacount" value is the fraction of rows that are non-null in this column.
    plt.title(c + ('(log {} times)'.format(n) if n > 0 else '') + 'datacount: ' + str(len(df_temp)/len(college_financials)))
    # The `_by_years` suffix keeps these files from overwriting the by-sector
    # plots that this file also saves under exploration/college_financials/<column>.pdf.
    plt.savefig('exploration/college_financials/' + c + '_by_years.pdf')
# Number of rows currently in the table.
college_financials.shape[0]